Análisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande:
library(tidyverse)
# Turn a chunk of raw CSV lines into ingredient vectors for East Asian recipes.
#
# lineas: character vector of comma-separated lines; the first field of each
#         line is the cuisine label, the remaining fields are ingredients.
# ...:    additional arguments are accepted and ignored (presumably whatever
#         else the chunked-reader callback passes along).
#
# Returns a list with one character vector of non-empty ingredient names for
# every line whose cuisine label is 'EastAsian'.
limpiar <- function(lineas, ...) {
  campos <- str_split(lineas, ',')
  asiaticas <- keep(campos, function(fila) fila[1] == 'EastAsian')
  map(asiaticas, function(fila) {
    ingredientes <- fila[-1]               # drop the cuisine label
    ingredientes[nchar(ingredientes) > 0]  # drop empty fields
  })
}
# Read the recipes file in chunks, skipping the first line and running
# `limpiar` on every chunk through a ListCallback.
filtrado <- read_lines_chunked(
  file = '../../datos/recetas/srep00196-s3.csv',
  callback = ListCallback$new(limpiar),
  skip = 1
)
# Remove one level of nesting from the per-chunk results.
recetas <- flatten(filtrado)
library(arules)
# Number of recipes kept after filtering.
length(recetas)
[1] 2512
# Mine frequent itemsets with apriori. Do not lower the minimum support much
# below this value, because there are relatively few transactions.
pars <- list(
  support = 0.05,
  target = 'frequent itemsets',
  ext = TRUE
)
ap_recetas <- apriori(data = recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 125
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [41 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 done [0.00s].
sorting transactions ... done [0.00s].
writing ... [628 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Number of itemsets stored in ap_recetas.
length(ap_recetas)
[1] 628
Vemos los ítems frecuentes individuales (conjuntos de tamaño 1), ordenados por soporte:
# Keep the itemsets of size 1, order them by support and convert the result
# to a data frame.
frecs <- DATAFRAME(
  sort(subset(ap_recetas, size(ap_recetas) == 1), by = 'support')
)
# Show the table as an interactive widget, numeric columns rounded to 3 digits.
frecs %>%
  mutate_if(is.numeric, round, digits = 3) %>%
  DT::datatable()
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Y ahora examinamos combinaciones frecuentes de distintos tamaños
# Itemsets of size 2 with support above 0.20, ordered by support.
ap_recetas %>%
  subset(size(.) == 2 & support > 0.20) %>%
  sort(by = 'support') %>%
  inspect()
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta; estos ingredientes son bases de salsas o combinaciones de condimentos:
# Itemsets of size 4 with support above 0.10, ordered by support.
ap_recetas %>%
  subset(size(.) == 4 & support > 0.10) %>%
  sort(by = 'support') %>%
  inspect()
# Mine association rules with a lower support threshold and a minimum
# confidence of 0.10.
pars <- list(
  support = 0.01,
  confidence = 0.10,
  target = 'rules',
  ext = TRUE
)
reglas_recetas <- apriori(data = recetas, parameter = pars)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 25
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[242 item(s), 2512 transaction(s)] done [0.00s].
sorting and recoding items ... [88 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 8 done [0.01s].
writing ... [50181 rule(s)] done [0.01s].
creating S4 object ... done [0.01s].
# Add the hyper-lift interest measure to the quality table of a set of rules.
#
# reglas: rules object whose quality table is extended.
# trans:  transactions handed to interestMeasure() to compute the measure.
#
# Returns `reglas` with a `hyper_lift` column in its quality table. The column
# is assigned by name, so calling the function again on its own result
# replaces the column instead of appending a duplicate one.
agregar_hyperlift <- function(reglas, trans) {
  calidad <- quality(reglas)
  calidad[["hyper_lift"]] <- interestMeasure(
    reglas,
    measure = "hyperLift",
    transactions = trans
  )
  quality(reglas) <- calidad
  reglas
}
# Attach hyper-lift to the mined rules, passing the recipes as transactions.
reglas_recetas <- agregar_hyperlift(reglas = reglas_recetas, trans = recetas)
library(arulesViz)
Loading required package: grid
Registered S3 method overwritten by 'seriation':
method from
reorder.hclust gclus
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# Rules with hyper-lift above 1.1, support above 0.1 and confidence above 0.40.
reglas_1 <- reglas_recetas %>%
  subset(hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
[1] 213
# Rules of size 2 among the selected rules.
reglas_tam_2 <- reglas_1 %>% subset(size(.) == 2)
# Optional inspection, left disabled:
# inspect(reglas_tam_2 %>% sort(by = 'hyper_lift'))
# Interactive plot of the selected rules whose support exceeds 0.2.
reglas_1 %>%
  subset(support > 0.2) %>%
  plot(engine = "plotly")
To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
library(tidygraph)
Attaching package: ‘tidygraph’
The following object is masked from ‘package:stats’:
filter
library(ggraph)
# Edge table for the graph: one row per size-2 rule, with LHS/RHS renamed to
# from/to. Only df_reglas is assigned here, so `frecs` keeps the table of
# frequent single items built earlier.
df_reglas <- reglas_tam_2 %>%
  DATAFRAME %>%
  rename(from = LHS, to = RHS) %>%
  data.frame
# Edge weight: natural log of the lift.
df_reglas$weight <- log(df_reglas$lift)
# Build the graph and annotate every node with its degree centrality.
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all"))
# Draw the rule graph with the 'fr' layout. The seed is set right before the
# plot is built, presumably so that the layout is reproducible.
set.seed(881)
ggraph(graph_1, layout = 'fr') +
  # Edges: red arrows whose transparency follows the lift.
  geom_edge_link(
    aes(alpha = lift),
    colour = 'red',
    arrow = arrow(length = unit(4, 'mm'))
  ) +
  # Nodes: size and colour follow the degree centrality.
  geom_node_point(aes(size = centrality, colour = centrality)) +
  geom_node_text(
    aes(label = name),
    size = 4,
    colour = 'gray20',
    repel = TRUE
  ) +
  theme_graph()
# Second selection: rules with hyper-lift above 1.5 and confidence above 0.1.
reglas_1 <- reglas_recetas %>%
  subset(hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
[1] 11190
# Rules of size 2 among the selected rules, and how many there are.
reglas_tam_2 <- reglas_1 %>% subset(size(.) == 2)
length(reglas_tam_2)
[1] 134
library(tidygraph)
library(ggraph)
# Edge table for the graph: one row per size-2 rule, with LHS/RHS renamed to
# from/to. as_tibble() replaces the deprecated as_data_frame().
df_reglas <- reglas_tam_2 %>%
  DATAFRAME %>%
  rename(from = LHS, to = RHS) %>%
  as_tibble()
# Edge weight: natural log of the hyper-lift.
df_reglas$weight <- log(df_reglas$hyper_lift)
# Build the graph and annotate every node with its degree centrality.
graph_1 <- as_tbl_graph(df_reglas) %>%
  mutate(centrality = centrality_degree(mode = "all"))
# Draw the rule graph with the 'fr' layout, passing start.temp = 100 to it.
# The seed is set first so that repeated runs give the same node placement.
set.seed(881)
ggraph(graph_1, layout = 'fr', start.temp = 100) +
  # Edges: red arrows whose transparency follows the lift.
  geom_edge_link(
    aes(alpha = lift),
    colour = 'red',
    arrow = arrow(length = unit(4, 'mm'))
  ) +
  # Nodes: size and colour follow the degree centrality.
  geom_node_point(aes(size = centrality, colour = centrality)) +
  geom_node_text(
    aes(label = name),
    size = 4,
    colour = 'gray20',
    repel = TRUE
  ) +
  theme_graph()
Exportamos para examinar en Gephi:
# Write the edge table to 'reglas.csv' with the endpoints renamed to
# source/target and the count column dropped. The output file is passed
# positionally because newer readr versions deprecate the `path` argument
# name in favour of `file`; the positional form works with both.
write_csv(
  df_reglas %>%
    rename(source = from, target = to) %>%
    select(-count),
  'reglas.csv'
)
La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maizena convertido en dos ingredientes, corn y starch):
# Look up the rule {corn} => {starch} in the edge table.
filter(df_reglas, from == "{corn}" & to == "{starch}")
La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos de recetas; aquí mostramos las primeras que contienen tanto tomato como corn:
# First 10 recipes that contain both "tomato" and "corn".
# NOTE(review): the surrounding text discusses corn and starch; confirm that
# "tomato" is the intended ingredient here.
recetas %>%
  keep(function(receta) all(c("tomato", "corn") %in% receta)) %>%
  head(10)
[[1]]
[1] "tomato" "vinegar" "pork" "celery_oil" "leek" "corn" "black_pepper"
[8] "pepper" "ginger" "pea" "garlic" "soybean" "soy_sauce" "chicken_broth"
[15] "wine"
[[2]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "corn" "cayenne" "pork" "garlic"
[9] "soybean" "vegetable" "coriander" "rice" "soy_sauce"
[[3]]
[1] "tomato" "vinegar" "pork" "celery_oil" "soy_sauce" "ginger" "garlic" "sherry"
[9] "corn"
[[4]]
[1] "pepper" "celery_oil" "starch" "corn" "ginger" "garlic" "soybean"
[8] "tomato" "vinegar" "beef" "soy_sauce" "cayenne" "scallion" "bell_pepper"
[15] "vegetable_oil" "rice" "wine"
[[5]]
[1] "tomato" "vinegar" "pork" "celery_oil" "beef" "soy_sauce" "ginger" "garlic"
[9] "corn" "wine"
[[6]]
[1] "tomato" "vinegar" "pepper" "lemon_juice" "celery_oil" "sake" "corn" "pork"
[9] "ginger" "honey" "garlic" "soybean" "rice" "soy_sauce"
[[7]]
[1] "tomato" "garlic" "onion" "bacon" "corn" "cayenne" "egg"
[[8]]
[1] "pork" "green_bell_pepper" "celery_oil" "starch" "corn"
[6] "garlic" "tomato" "vinegar" "onion" "soy_sauce"
[11] "cider" "scallion" "celery" "pineapple" "vegetable_oil"
[16] "egg"
[[9]]
[1] "tomato" "vinegar" "pepper" "celery_oil" "roasted_pork" "soy_sauce" "ginger"
[8] "honey" "garlic" "cinnamon" "soybean" "sherry" "corn" "oyster"
[[10]]
[1] "cane_molasses" "tomato" "pork" "celery_oil" "vinegar" "soy_sauce" "pepper"
[8] "ginger" "garlic" "corn"